// Copyright 2021-2023 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License").  You may not use
// this file except in compliance with the License.  You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// This module implements support for Armv8 SM3 instructions

// $output is the last argument if it looks like a file (it has an extension)
// $flavour is the first argument if it doesn't look like a file
#include "arm_arch.h"
.text
//-----------------------------------------------------------------------
// ossl_hwsm3_block_data_order
//
// Runs the SM3 compression function over consecutive 64-byte input blocks
// using the Armv8 SM3 instructions. Those instructions are emitted as raw
// .inst words; the mnemonic each word stands for is given in the comment
// to its right.
//
// In:    x0 = address of the eight 32-bit state words (read on entry,
//             overwritten with the updated state on exit)
//        x1 = address of the input data, advanced by 64 bytes per block
//        w2 = number of blocks; must be non-zero, because the counter is
//             decremented and a block is processed before it is tested
// Out:   none in registers; the updated state is stored at [x0]
// Clobb: x1, w2, x8, v0-v6, v16-v23 (all caller-saved under AAPCS64);
//        the stack is not used
//
// Register roles inside the loop:
//   v0-v4    rotating window of message words: 16 current words plus the
//            4 words produced by the most recent expansion step
//   v5, v6   working state (state words 0-3 and 4-7, word order reversed)
//   v16,v17  first and second word of .Tj in lane 0, other lanes zero
//   v18,v19  copy of v5/v6 taken at the top of each block
//   v20,v21  round constant in lane 3, ping-ponged between the two
//            registers and rotated left by one bit after every round
//   v22      expansion temporary, then W[j..j+3] ^ W[j+4..j+7]
//   v23      expansion temporary, then the sm3ss1 result
//
// In the expansion-step comments, wN indices are relative to the oldest
// word currently held in the 16-word window, not absolute W indices.
//
// NOTE(review): only the low 32 bits of the block count are used (w2);
// confirm this matches the width of the C prototype's count argument.
//-----------------------------------------------------------------------
.globl  ossl_hwsm3_block_data_order
.type   ossl_hwsm3_block_data_order,%function
.align  5
ossl_hwsm3_block_data_order:
        // Macro supplied by arm_arch.h; presumably a BTI landing pad when
        // branch protection is enabled -- confirm in the header.
        AARCH64_VALID_CALL_TARGET
        // load state
        ld1     {v5.4s,v6.4s}, [x0]
        // Reverse the order of the four 32-bit words in each register:
        // rev64 swaps the two words inside each 64-bit half, ext #8 then
        // swaps the halves. The same sequence is applied again before the
        // state is stored, restoring the in-memory word order.
        rev64   v5.4s, v5.4s
        rev64   v6.4s, v6.4s
        ext     v5.16b, v5.16b, v5.16b, #8
        ext     v6.16b, v6.16b, v6.16b, #8

        // Fetch both round-constant seeds. The scalar loads fill lane 0 of
        // v16/v17 and clear the remaining lanes.
        adr     x8, .Tj
        ldp     s16, s17, [x8]

.Loop:
        // load input
        ld1     {v0.16b,v1.16b,v2.16b,v3.16b}, [x1], #64
        sub     w2, w2, #1

        // Keep the incoming state; it is XORed back in after round 63.
        mov     v18.16b, v5.16b
        mov     v19.16b, v6.16b

        // Byte-swap every 32-bit message word unless __ARMEB__ is defined.
        // NOTE(review): the load above uses byte (.16b) lanes, so register
        // byte order follows memory order on either endianness; confirm
        // which builds define __ARMEB__ (AArch64 toolchains normally signal
        // big-endian with __AARCH64EB__) and that skipping the swap there
        // is intended.
#ifndef __ARMEB__
        rev32   v0.16b, v0.16b
        rev32   v1.16b, v1.16b
        rev32   v2.16b, v2.16b
        rev32   v3.16b, v3.16b
#endif

        // Move the first .Tj word from lane 0 to lane 3 (lanes 0-2 become
        // zero); this seeds the constant for rounds 0-15.
        ext     v20.16b, v16.16b, v16.16b, #4

        // ---- rounds 0-3: expand W16..W19 into v4, consume W0..W3 (v0) ----
        // v4 = w7  | w8  | w9  | w10
        ext     v4.16b, v1.16b, v2.16b, #12
        // v22 = w3  | w4  | w5  | w6
        ext     v22.16b, v0.16b, v1.16b, #12
        // v23 = w10 | w11 | w12 | w13
        ext     v23.16b, v2.16b, v3.16b, #8
.inst   0xce63c004      //sm3partw1 v4.4s, v0.4s, v3.4s
.inst   0xce76c6e4      //sm3partw2 v4.4s, v23.4s, v22.4s
        // v22 = W[j..j+3] ^ W[j+4..j+7], one lane per round of this group
        eor     v22.16b, v0.16b, v1.16b
.inst   0xce5418b7      //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
        // v21 = v20 rotated left by 1: the constant for the next round
        shl     v21.4s, v20.4s, #1
        sri     v21.4s, v20.4s, #31
.inst   0xce5682e5      //sm3tt1a v5.4s, v23.4s, v22.4s[0]
.inst   0xce408ae6      //sm3tt2a v6.4s, v23.4s, v0.4s[0]
.inst   0xce5518b7      //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
        shl     v20.4s, v21.4s, #1
        sri     v20.4s, v21.4s, #31
.inst   0xce5692e5      //sm3tt1a v5.4s, v23.4s, v22.4s[1]
.inst   0xce409ae6      //sm3tt2a v6.4s, v23.4s, v0.4s[1]
.inst   0xce5418b7      //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
        shl     v21.4s, v20.4s, #1
        sri     v21.4s, v20.4s, #31
.inst   0xce56a2e5      //sm3tt1a v5.4s, v23.4s, v22.4s[2]
.inst   0xce40aae6      //sm3tt2a v6.4s, v23.4s, v0.4s[2]
.inst   0xce5518b7      //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
        shl     v20.4s, v21.4s, #1
        sri     v20.4s, v21.4s, #31
.inst   0xce56b2e5      //sm3tt1a v5.4s, v23.4s, v22.4s[3]
.inst   0xce40bae6      //sm3tt2a v6.4s, v23.4s, v0.4s[3]

        // ---- rounds 4-7: expand W20..W23 into v0, consume W4..W7 (v1) ----
        // v0 = w7  | w8  | w9  | w10
        ext     v0.16b, v2.16b, v3.16b, #12
        // v22 = w3  | w4  | w5  | w6
        ext     v22.16b, v1.16b, v2.16b, #12
        // v23 = w10 | w11 | w12 | w13
        ext     v23.16b, v3.16b, v4.16b, #8
.inst   0xce64c020      //sm3partw1 v0.4s, v1.4s, v4.4s
.inst   0xce76c6e0      //sm3partw2 v0.4s, v23.4s, v22.4s
        eor     v22.16b, v1.16b, v2.16b
.inst   0xce5418b7      //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
        shl     v21.4s, v20.4s, #1
        sri     v21.4s, v20.4s, #31
.inst   0xce5682e5      //sm3tt1a v5.4s, v23.4s, v22.4s[0]
.inst   0xce418ae6      //sm3tt2a v6.4s, v23.4s, v1.4s[0]
.inst   0xce5518b7      //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
        shl     v20.4s, v21.4s, #1
        sri     v20.4s, v21.4s, #31
.inst   0xce5692e5      //sm3tt1a v5.4s, v23.4s, v22.4s[1]
.inst   0xce419ae6      //sm3tt2a v6.4s, v23.4s, v1.4s[1]
.inst   0xce5418b7      //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
        shl     v21.4s, v20.4s, #1
        sri     v21.4s, v20.4s, #31
.inst   0xce56a2e5      //sm3tt1a v5.4s, v23.4s, v22.4s[2]
.inst   0xce41aae6      //sm3tt2a v6.4s, v23.4s, v1.4s[2]
.inst   0xce5518b7      //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
        shl     v20.4s, v21.4s, #1
        sri     v20.4s, v21.4s, #31
.inst   0xce56b2e5      //sm3tt1a v5.4s, v23.4s, v22.4s[3]
.inst   0xce41bae6      //sm3tt2a v6.4s, v23.4s, v1.4s[3]

        // ---- rounds 8-11: expand W24..W27 into v1, consume W8..W11 (v2) ----
        // v1 = w7  | w8  | w9  | w10
        ext     v1.16b, v3.16b, v4.16b, #12
        // v22 = w3  | w4  | w5  | w6
        ext     v22.16b, v2.16b, v3.16b, #12
        // v23 = w10 | w11 | w12 | w13
        ext     v23.16b, v4.16b, v0.16b, #8
.inst   0xce60c041      //sm3partw1 v1.4s, v2.4s, v0.4s
.inst   0xce76c6e1      //sm3partw2 v1.4s, v23.4s, v22.4s
        eor     v22.16b, v2.16b, v3.16b
.inst   0xce5418b7      //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
        shl     v21.4s, v20.4s, #1
        sri     v21.4s, v20.4s, #31
.inst   0xce5682e5      //sm3tt1a v5.4s, v23.4s, v22.4s[0]
.inst   0xce428ae6      //sm3tt2a v6.4s, v23.4s, v2.4s[0]
.inst   0xce5518b7      //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
        shl     v20.4s, v21.4s, #1
        sri     v20.4s, v21.4s, #31
.inst   0xce5692e5      //sm3tt1a v5.4s, v23.4s, v22.4s[1]
.inst   0xce429ae6      //sm3tt2a v6.4s, v23.4s, v2.4s[1]
.inst   0xce5418b7      //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
        shl     v21.4s, v20.4s, #1
        sri     v21.4s, v20.4s, #31
.inst   0xce56a2e5      //sm3tt1a v5.4s, v23.4s, v22.4s[2]
.inst   0xce42aae6      //sm3tt2a v6.4s, v23.4s, v2.4s[2]
.inst   0xce5518b7      //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
        shl     v20.4s, v21.4s, #1
        sri     v20.4s, v21.4s, #31
.inst   0xce56b2e5      //sm3tt1a v5.4s, v23.4s, v22.4s[3]
.inst   0xce42bae6      //sm3tt2a v6.4s, v23.4s, v2.4s[3]

        // ---- rounds 12-15: expand W28..W31 into v2, consume W12..W15 (v3) ----
        // v2 = w7  | w8  | w9  | w10
        ext     v2.16b, v4.16b, v0.16b, #12
        // v22 = w3  | w4  | w5  | w6
        ext     v22.16b, v3.16b, v4.16b, #12
        // v23 = w10 | w11 | w12 | w13
        ext     v23.16b, v0.16b, v1.16b, #8
.inst   0xce61c062      //sm3partw1 v2.4s, v3.4s, v1.4s
.inst   0xce76c6e2      //sm3partw2 v2.4s, v23.4s, v22.4s
        eor     v22.16b, v3.16b, v4.16b
.inst   0xce5418b7      //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
        shl     v21.4s, v20.4s, #1
        sri     v21.4s, v20.4s, #31
.inst   0xce5682e5      //sm3tt1a v5.4s, v23.4s, v22.4s[0]
.inst   0xce438ae6      //sm3tt2a v6.4s, v23.4s, v3.4s[0]
.inst   0xce5518b7      //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
        shl     v20.4s, v21.4s, #1
        sri     v20.4s, v21.4s, #31
.inst   0xce5692e5      //sm3tt1a v5.4s, v23.4s, v22.4s[1]
.inst   0xce439ae6      //sm3tt2a v6.4s, v23.4s, v3.4s[1]
.inst   0xce5418b7      //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
        shl     v21.4s, v20.4s, #1
        sri     v21.4s, v20.4s, #31
.inst   0xce56a2e5      //sm3tt1a v5.4s, v23.4s, v22.4s[2]
.inst   0xce43aae6      //sm3tt2a v6.4s, v23.4s, v3.4s[2]
.inst   0xce5518b7      //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
        shl     v20.4s, v21.4s, #1
        sri     v20.4s, v21.4s, #31
.inst   0xce56b2e5      //sm3tt1a v5.4s, v23.4s, v22.4s[3]
.inst   0xce43bae6      //sm3tt2a v6.4s, v23.4s, v3.4s[3]

        // From round 16 on the "b" forms of sm3tt1/sm3tt2 are used and the
        // rotating constant is re-seeded from the second .Tj word.
        ext     v20.16b, v17.16b, v17.16b, #4

        // ---- rounds 16-19: expand W32..W35 into v3, consume W16..W19 (v4) ----
        // v3 = w7  | w8  | w9  | w10
        ext     v3.16b, v0.16b, v1.16b, #12
        // v22 = w3  | w4  | w5  | w6
        ext     v22.16b, v4.16b, v0.16b, #12
        // v23 = w10 | w11 | w12 | w13
        ext     v23.16b, v1.16b, v2.16b, #8
.inst   0xce62c083      //sm3partw1 v3.4s, v4.4s, v2.4s
.inst   0xce76c6e3      //sm3partw2 v3.4s, v23.4s, v22.4s
        eor     v22.16b, v4.16b, v0.16b
.inst   0xce5418b7      //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
        shl     v21.4s, v20.4s, #1
        sri     v21.4s, v20.4s, #31
.inst   0xce5686e5      //sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst   0xce448ee6      //sm3tt2b v6.4s, v23.4s, v4.4s[0]
.inst   0xce5518b7      //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
        shl     v20.4s, v21.4s, #1
        sri     v20.4s, v21.4s, #31
.inst   0xce5696e5      //sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst   0xce449ee6      //sm3tt2b v6.4s, v23.4s, v4.4s[1]
.inst   0xce5418b7      //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
        shl     v21.4s, v20.4s, #1
        sri     v21.4s, v20.4s, #31
.inst   0xce56a6e5      //sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst   0xce44aee6      //sm3tt2b v6.4s, v23.4s, v4.4s[2]
.inst   0xce5518b7      //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
        shl     v20.4s, v21.4s, #1
        sri     v20.4s, v21.4s, #31
.inst   0xce56b6e5      //sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst   0xce44bee6      //sm3tt2b v6.4s, v23.4s, v4.4s[3]

        // ---- rounds 20-23: expand W36..W39 into v4, consume W20..W23 (v0) ----
        // v4 = w7  | w8  | w9  | w10
        ext     v4.16b, v1.16b, v2.16b, #12
        // v22 = w3  | w4  | w5  | w6
        ext     v22.16b, v0.16b, v1.16b, #12
        // v23 = w10 | w11 | w12 | w13
        ext     v23.16b, v2.16b, v3.16b, #8
.inst   0xce63c004      //sm3partw1 v4.4s, v0.4s, v3.4s
.inst   0xce76c6e4      //sm3partw2 v4.4s, v23.4s, v22.4s
        eor     v22.16b, v0.16b, v1.16b
.inst   0xce5418b7      //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
        shl     v21.4s, v20.4s, #1
        sri     v21.4s, v20.4s, #31
.inst   0xce5686e5      //sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst   0xce408ee6      //sm3tt2b v6.4s, v23.4s, v0.4s[0]
.inst   0xce5518b7      //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
        shl     v20.4s, v21.4s, #1
        sri     v20.4s, v21.4s, #31
.inst   0xce5696e5      //sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst   0xce409ee6      //sm3tt2b v6.4s, v23.4s, v0.4s[1]
.inst   0xce5418b7      //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
        shl     v21.4s, v20.4s, #1
        sri     v21.4s, v20.4s, #31
.inst   0xce56a6e5      //sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst   0xce40aee6      //sm3tt2b v6.4s, v23.4s, v0.4s[2]
.inst   0xce5518b7      //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
        shl     v20.4s, v21.4s, #1
        sri     v20.4s, v21.4s, #31
.inst   0xce56b6e5      //sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst   0xce40bee6      //sm3tt2b v6.4s, v23.4s, v0.4s[3]

        // ---- rounds 24-27: expand W40..W43 into v0, consume W24..W27 (v1) ----
        // v0 = w7  | w8  | w9  | w10
        ext     v0.16b, v2.16b, v3.16b, #12
        // v22 = w3  | w4  | w5  | w6
        ext     v22.16b, v1.16b, v2.16b, #12
        // v23 = w10 | w11 | w12 | w13
        ext     v23.16b, v3.16b, v4.16b, #8
.inst   0xce64c020      //sm3partw1 v0.4s, v1.4s, v4.4s
.inst   0xce76c6e0      //sm3partw2 v0.4s, v23.4s, v22.4s
        eor     v22.16b, v1.16b, v2.16b
.inst   0xce5418b7      //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
        shl     v21.4s, v20.4s, #1
        sri     v21.4s, v20.4s, #31
.inst   0xce5686e5      //sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst   0xce418ee6      //sm3tt2b v6.4s, v23.4s, v1.4s[0]
.inst   0xce5518b7      //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
        shl     v20.4s, v21.4s, #1
        sri     v20.4s, v21.4s, #31
.inst   0xce5696e5      //sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst   0xce419ee6      //sm3tt2b v6.4s, v23.4s, v1.4s[1]
.inst   0xce5418b7      //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
        shl     v21.4s, v20.4s, #1
        sri     v21.4s, v20.4s, #31
.inst   0xce56a6e5      //sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst   0xce41aee6      //sm3tt2b v6.4s, v23.4s, v1.4s[2]
.inst   0xce5518b7      //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
        shl     v20.4s, v21.4s, #1
        sri     v20.4s, v21.4s, #31
.inst   0xce56b6e5      //sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst   0xce41bee6      //sm3tt2b v6.4s, v23.4s, v1.4s[3]

        // ---- rounds 28-31: expand W44..W47 into v1, consume W28..W31 (v2) ----
        // v1 = w7  | w8  | w9  | w10
        ext     v1.16b, v3.16b, v4.16b, #12
        // v22 = w3  | w4  | w5  | w6
        ext     v22.16b, v2.16b, v3.16b, #12
        // v23 = w10 | w11 | w12 | w13
        ext     v23.16b, v4.16b, v0.16b, #8
.inst   0xce60c041      //sm3partw1 v1.4s, v2.4s, v0.4s
.inst   0xce76c6e1      //sm3partw2 v1.4s, v23.4s, v22.4s
        eor     v22.16b, v2.16b, v3.16b
.inst   0xce5418b7      //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
        shl     v21.4s, v20.4s, #1
        sri     v21.4s, v20.4s, #31
.inst   0xce5686e5      //sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst   0xce428ee6      //sm3tt2b v6.4s, v23.4s, v2.4s[0]
.inst   0xce5518b7      //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
        shl     v20.4s, v21.4s, #1
        sri     v20.4s, v21.4s, #31
.inst   0xce5696e5      //sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst   0xce429ee6      //sm3tt2b v6.4s, v23.4s, v2.4s[1]
.inst   0xce5418b7      //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
        shl     v21.4s, v20.4s, #1
        sri     v21.4s, v20.4s, #31
.inst   0xce56a6e5      //sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst   0xce42aee6      //sm3tt2b v6.4s, v23.4s, v2.4s[2]
.inst   0xce5518b7      //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
        shl     v20.4s, v21.4s, #1
        sri     v20.4s, v21.4s, #31
.inst   0xce56b6e5      //sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst   0xce42bee6      //sm3tt2b v6.4s, v23.4s, v2.4s[3]

        // ---- rounds 32-35: expand W48..W51 into v2, consume W32..W35 (v3) ----
        // v2 = w7  | w8  | w9  | w10
        ext     v2.16b, v4.16b, v0.16b, #12
        // v22 = w3  | w4  | w5  | w6
        ext     v22.16b, v3.16b, v4.16b, #12
        // v23 = w10 | w11 | w12 | w13
        ext     v23.16b, v0.16b, v1.16b, #8
.inst   0xce61c062      //sm3partw1 v2.4s, v3.4s, v1.4s
.inst   0xce76c6e2      //sm3partw2 v2.4s, v23.4s, v22.4s
        eor     v22.16b, v3.16b, v4.16b
.inst   0xce5418b7      //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
        shl     v21.4s, v20.4s, #1
        sri     v21.4s, v20.4s, #31
.inst   0xce5686e5      //sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst   0xce438ee6      //sm3tt2b v6.4s, v23.4s, v3.4s[0]
.inst   0xce5518b7      //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
        shl     v20.4s, v21.4s, #1
        sri     v20.4s, v21.4s, #31
.inst   0xce5696e5      //sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst   0xce439ee6      //sm3tt2b v6.4s, v23.4s, v3.4s[1]
.inst   0xce5418b7      //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
        shl     v21.4s, v20.4s, #1
        sri     v21.4s, v20.4s, #31
.inst   0xce56a6e5      //sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst   0xce43aee6      //sm3tt2b v6.4s, v23.4s, v3.4s[2]
.inst   0xce5518b7      //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
        shl     v20.4s, v21.4s, #1
        sri     v20.4s, v21.4s, #31
.inst   0xce56b6e5      //sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst   0xce43bee6      //sm3tt2b v6.4s, v23.4s, v3.4s[3]

        // ---- rounds 36-39: expand W52..W55 into v3, consume W36..W39 (v4) ----
        // v3 = w7  | w8  | w9  | w10
        ext     v3.16b, v0.16b, v1.16b, #12
        // v22 = w3  | w4  | w5  | w6
        ext     v22.16b, v4.16b, v0.16b, #12
        // v23 = w10 | w11 | w12 | w13
        ext     v23.16b, v1.16b, v2.16b, #8
.inst   0xce62c083      //sm3partw1 v3.4s, v4.4s, v2.4s
.inst   0xce76c6e3      //sm3partw2 v3.4s, v23.4s, v22.4s
        eor     v22.16b, v4.16b, v0.16b
.inst   0xce5418b7      //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
        shl     v21.4s, v20.4s, #1
        sri     v21.4s, v20.4s, #31
.inst   0xce5686e5      //sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst   0xce448ee6      //sm3tt2b v6.4s, v23.4s, v4.4s[0]
.inst   0xce5518b7      //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
        shl     v20.4s, v21.4s, #1
        sri     v20.4s, v21.4s, #31
.inst   0xce5696e5      //sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst   0xce449ee6      //sm3tt2b v6.4s, v23.4s, v4.4s[1]
.inst   0xce5418b7      //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
        shl     v21.4s, v20.4s, #1
        sri     v21.4s, v20.4s, #31
.inst   0xce56a6e5      //sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst   0xce44aee6      //sm3tt2b v6.4s, v23.4s, v4.4s[2]
.inst   0xce5518b7      //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
        shl     v20.4s, v21.4s, #1
        sri     v20.4s, v21.4s, #31
.inst   0xce56b6e5      //sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst   0xce44bee6      //sm3tt2b v6.4s, v23.4s, v4.4s[3]

        // ---- rounds 40-43: expand W56..W59 into v4, consume W40..W43 (v0) ----
        // v4 = w7  | w8  | w9  | w10
        ext     v4.16b, v1.16b, v2.16b, #12
        // v22 = w3  | w4  | w5  | w6
        ext     v22.16b, v0.16b, v1.16b, #12
        // v23 = w10 | w11 | w12 | w13
        ext     v23.16b, v2.16b, v3.16b, #8
.inst   0xce63c004      //sm3partw1 v4.4s, v0.4s, v3.4s
.inst   0xce76c6e4      //sm3partw2 v4.4s, v23.4s, v22.4s
        eor     v22.16b, v0.16b, v1.16b
.inst   0xce5418b7      //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
        shl     v21.4s, v20.4s, #1
        sri     v21.4s, v20.4s, #31
.inst   0xce5686e5      //sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst   0xce408ee6      //sm3tt2b v6.4s, v23.4s, v0.4s[0]
.inst   0xce5518b7      //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
        shl     v20.4s, v21.4s, #1
        sri     v20.4s, v21.4s, #31
.inst   0xce5696e5      //sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst   0xce409ee6      //sm3tt2b v6.4s, v23.4s, v0.4s[1]
.inst   0xce5418b7      //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
        shl     v21.4s, v20.4s, #1
        sri     v21.4s, v20.4s, #31
.inst   0xce56a6e5      //sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst   0xce40aee6      //sm3tt2b v6.4s, v23.4s, v0.4s[2]
.inst   0xce5518b7      //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
        shl     v20.4s, v21.4s, #1
        sri     v20.4s, v21.4s, #31
.inst   0xce56b6e5      //sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst   0xce40bee6      //sm3tt2b v6.4s, v23.4s, v0.4s[3]

        // ---- rounds 44-47: expand W60..W63 into v0, consume W44..W47 (v1) ----
        // v0 = w7  | w8  | w9  | w10
        ext     v0.16b, v2.16b, v3.16b, #12
        // v22 = w3  | w4  | w5  | w6
        ext     v22.16b, v1.16b, v2.16b, #12
        // v23 = w10 | w11 | w12 | w13
        ext     v23.16b, v3.16b, v4.16b, #8
.inst   0xce64c020      //sm3partw1 v0.4s, v1.4s, v4.4s
.inst   0xce76c6e0      //sm3partw2 v0.4s, v23.4s, v22.4s
        eor     v22.16b, v1.16b, v2.16b
.inst   0xce5418b7      //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
        shl     v21.4s, v20.4s, #1
        sri     v21.4s, v20.4s, #31
.inst   0xce5686e5      //sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst   0xce418ee6      //sm3tt2b v6.4s, v23.4s, v1.4s[0]
.inst   0xce5518b7      //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
        shl     v20.4s, v21.4s, #1
        sri     v20.4s, v21.4s, #31
.inst   0xce5696e5      //sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst   0xce419ee6      //sm3tt2b v6.4s, v23.4s, v1.4s[1]
.inst   0xce5418b7      //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
        shl     v21.4s, v20.4s, #1
        sri     v21.4s, v20.4s, #31
.inst   0xce56a6e5      //sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst   0xce41aee6      //sm3tt2b v6.4s, v23.4s, v1.4s[2]
.inst   0xce5518b7      //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
        shl     v20.4s, v21.4s, #1
        sri     v20.4s, v21.4s, #31
.inst   0xce56b6e5      //sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst   0xce41bee6      //sm3tt2b v6.4s, v23.4s, v1.4s[3]

        // ---- rounds 48-51: expand W64..W67 into v1, consume W48..W51 (v2) ----
        // This is the last expansion step; W67 is the highest word needed.
        // v1 = w7  | w8  | w9  | w10
        ext     v1.16b, v3.16b, v4.16b, #12
        // v22 = w3  | w4  | w5  | w6
        ext     v22.16b, v2.16b, v3.16b, #12
        // v23 = w10 | w11 | w12 | w13
        ext     v23.16b, v4.16b, v0.16b, #8
.inst   0xce60c041      //sm3partw1 v1.4s, v2.4s, v0.4s
.inst   0xce76c6e1      //sm3partw2 v1.4s, v23.4s, v22.4s
        eor     v22.16b, v2.16b, v3.16b
.inst   0xce5418b7      //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
        shl     v21.4s, v20.4s, #1
        sri     v21.4s, v20.4s, #31
.inst   0xce5686e5      //sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst   0xce428ee6      //sm3tt2b v6.4s, v23.4s, v2.4s[0]
.inst   0xce5518b7      //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
        shl     v20.4s, v21.4s, #1
        sri     v20.4s, v21.4s, #31
.inst   0xce5696e5      //sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst   0xce429ee6      //sm3tt2b v6.4s, v23.4s, v2.4s[1]
.inst   0xce5418b7      //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
        shl     v21.4s, v20.4s, #1
        sri     v21.4s, v20.4s, #31
.inst   0xce56a6e5      //sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst   0xce42aee6      //sm3tt2b v6.4s, v23.4s, v2.4s[2]
.inst   0xce5518b7      //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
        shl     v20.4s, v21.4s, #1
        sri     v20.4s, v21.4s, #31
.inst   0xce56b6e5      //sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst   0xce42bee6      //sm3tt2b v6.4s, v23.4s, v2.4s[3]

        // ---- rounds 52-55: no expansion, consume W52..W55 (v3) ----
        eor     v22.16b, v3.16b, v4.16b
.inst   0xce5418b7      //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
        shl     v21.4s, v20.4s, #1
        sri     v21.4s, v20.4s, #31
.inst   0xce5686e5      //sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst   0xce438ee6      //sm3tt2b v6.4s, v23.4s, v3.4s[0]
.inst   0xce5518b7      //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
        shl     v20.4s, v21.4s, #1
        sri     v20.4s, v21.4s, #31
.inst   0xce5696e5      //sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst   0xce439ee6      //sm3tt2b v6.4s, v23.4s, v3.4s[1]
.inst   0xce5418b7      //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
        shl     v21.4s, v20.4s, #1
        sri     v21.4s, v20.4s, #31
.inst   0xce56a6e5      //sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst   0xce43aee6      //sm3tt2b v6.4s, v23.4s, v3.4s[2]
.inst   0xce5518b7      //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
        shl     v20.4s, v21.4s, #1
        sri     v20.4s, v21.4s, #31
.inst   0xce56b6e5      //sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst   0xce43bee6      //sm3tt2b v6.4s, v23.4s, v3.4s[3]

        // ---- rounds 56-59: no expansion, consume W56..W59 (v4) ----
        eor     v22.16b, v4.16b, v0.16b
.inst   0xce5418b7      //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
        shl     v21.4s, v20.4s, #1
        sri     v21.4s, v20.4s, #31
.inst   0xce5686e5      //sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst   0xce448ee6      //sm3tt2b v6.4s, v23.4s, v4.4s[0]
.inst   0xce5518b7      //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
        shl     v20.4s, v21.4s, #1
        sri     v20.4s, v21.4s, #31
.inst   0xce5696e5      //sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst   0xce449ee6      //sm3tt2b v6.4s, v23.4s, v4.4s[1]
.inst   0xce5418b7      //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
        shl     v21.4s, v20.4s, #1
        sri     v21.4s, v20.4s, #31
.inst   0xce56a6e5      //sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst   0xce44aee6      //sm3tt2b v6.4s, v23.4s, v4.4s[2]
.inst   0xce5518b7      //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
        shl     v20.4s, v21.4s, #1
        sri     v20.4s, v21.4s, #31
.inst   0xce56b6e5      //sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst   0xce44bee6      //sm3tt2b v6.4s, v23.4s, v4.4s[3]

        // ---- rounds 60-63: no expansion, consume W60..W63 (v0) ----
        eor     v22.16b, v0.16b, v1.16b
.inst   0xce5418b7      //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
        shl     v21.4s, v20.4s, #1
        sri     v21.4s, v20.4s, #31
.inst   0xce5686e5      //sm3tt1b v5.4s, v23.4s, v22.4s[0]
.inst   0xce408ee6      //sm3tt2b v6.4s, v23.4s, v0.4s[0]
.inst   0xce5518b7      //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
        shl     v20.4s, v21.4s, #1
        sri     v20.4s, v21.4s, #31
.inst   0xce5696e5      //sm3tt1b v5.4s, v23.4s, v22.4s[1]
.inst   0xce409ee6      //sm3tt2b v6.4s, v23.4s, v0.4s[1]
.inst   0xce5418b7      //sm3ss1 v23.4s, v5.4s, v20.4s, v6.4s
        shl     v21.4s, v20.4s, #1
        sri     v21.4s, v20.4s, #31
.inst   0xce56a6e5      //sm3tt1b v5.4s, v23.4s, v22.4s[2]
.inst   0xce40aee6      //sm3tt2b v6.4s, v23.4s, v0.4s[2]
.inst   0xce5518b7      //sm3ss1 v23.4s, v5.4s, v21.4s, v6.4s
        shl     v20.4s, v21.4s, #1
        sri     v20.4s, v21.4s, #31
.inst   0xce56b6e5      //sm3tt1b v5.4s, v23.4s, v22.4s[3]
.inst   0xce40bee6      //sm3tt2b v6.4s, v23.4s, v0.4s[3]

        // Feed-forward: XOR the state saved at the top of the block into
        // the result of the 64 rounds.
        eor     v5.16b, v5.16b, v18.16b
        eor     v6.16b, v6.16b, v19.16b

        // any blocks remaining?
        cbnz    w2, .Loop

        // save state (undo the word reversal applied after the initial load)
        rev64   v5.4s, v5.4s
        rev64   v6.4s, v6.4s
        ext     v5.16b, v5.16b, v5.16b, #8
        ext     v6.16b, v6.16b, v6.16b, #8
        st1     {v5.4s,v6.4s}, [x0]
        ret
.size   ossl_hwsm3_block_data_order,.-ossl_hwsm3_block_data_order

// Round-constant seeds for ossl_hwsm3_block_data_order, fetched once at
// function entry with a PC-relative adr and a scalar ldp. The first word
// lands in s16 and seeds the rotating constant used with the
// sm3tt1a/sm3tt2a rounds; the second lands in s17 and re-seeds it just
// before the first sm3tt1b/sm3tt2b round. Within each phase the constant
// is rotated left by one bit after every round.
// NOTE(review): 0x9d8a7a87 is 0x7a879d8a rotated left by 16 -- presumably
// the SM3 constant for rounds 16-63 pre-rotated for round 16; confirm
// against the SM3 specification.
.align  3
.Tj:
.word   0x79cc4519, 0x9d8a7a87
